read_student_data<-function(years){
  
  setwd(wd_data_final)
  #Load student data_student
  for (i in years){
    print(i)
    data_temp<-readRDS(paste("data_merged_complete_town_siiir_codes_",i,".rds",sep="")) 
    data_temp$an<-i
    
    if (i==years[1]){
      data_student<-data_temp
    } else {
      data_student<-bind_rows(data_student,data_temp)
    }
    rm(data_temp)
    gc()
  }
  

    
    Sys.setlocale(locale="Romanian.utf8")
    data_student$specializare_bac2<-trimws(data_student$specializare_bac)
    data_student$specializare_bac2<-toupper(data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Î","I",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Ă","A",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Ţ","T",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Ț","T",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Ș","S",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("Ț","T",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("\\(","",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("\\)","",data_student$specializare_bac2)
    data_student$specializare_bac2<-gsub("[0-9]","",data_student$specializare_bac2,perl=T)
    data_student$specializare_bac2<-trimws(data_student$specializare_bac2)
    Sys.setlocale(locale="English.utf8")
    
    data_bkp0<-data_student
    
    

    data_student<-data_student %>%
      mutate(media=case_when(
        is.na(media) & rezultat %in% c('respins','Respins','RESPINS') & !is.na(disciplina_alegere_celelalte_arii_culiculare_final)  ~ 
          (lb_romana_final+disciplina_obligatorie_scris_final+disciplina_alegere_aria_culiculara_final+disciplina_alegere_celelalte_arii_culiculare_final)/4,
        is.na(media) & rezultat %in% c('respins','Respins','RESPINS') & is.na(disciplina_alegere_celelalte_arii_culiculare_final)  ~ 
          (lb_romana_final+disciplina_obligatorie_scris_final+disciplina_alegere_aria_culiculara_final)/3,
        TRUE ~ media
      ))
    
    
    setwd(wd_data_raw_other)
    #add SIRUTA 2-3 mapping
    siruta<-read.xlsx("SIRUTA2-3.xlsx") %>% filter(MED==3) %>% select(SIRUTA,SIRSUP) 
    data_student<-data_student %>% 
      base::merge(siruta,by.x=c("Cod_SIRUTA_hs_bac"),by.y=c("SIRUTA"),all.x=T) %>%
      mutate(SIRSUP=ifelse(is.na(SIRSUP),Cod_SIRUTA_hs_bac,SIRSUP)) %>%
      rename(Cod_SIRUTA2_hs_bac=SIRSUP)
    data_student<-data_student %>%
      base::merge(siruta,by.x=c("Cod_SIRUTA_hs_adm"),by.y=c("SIRUTA"),all.x=T) %>%
      mutate(SIRSUP=ifelse(is.na(SIRSUP),Cod_SIRUTA_hs_adm,SIRSUP)) %>%
      rename(Cod_SIRUTA2_hs_adm=SIRSUP) 
    data_student<-data_student %>%
      base::merge(siruta,by.x=c("Cod_SIRUTA_ms_adm"),by.y=c("SIRUTA"),all.x=T) %>%
      mutate(SIRSUP=ifelse(is.na(SIRSUP),Cod_SIRUTA_ms_adm,SIRSUP)) %>%
      rename(Cod_SIRUTA2_ms_adm=SIRSUP) 
    
    ####fix SIIIR Codes and town_hs
    #find out frequencies of SIRUTA and town_hs to figure out conflicts
    data_locality_frequency<-data_student %>%
      group_by(judet_bac,Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac,town_hs_bac) %>%
      summarise(n=n()) %>%
      arrange(judet_bac,town_hs_bac,-n) %>%
      group_by(Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac) %>%
      mutate(p_siruta=n/sum(n)) %>%
      group_by(judet_bac,town_hs_bac) %>%
      mutate(p_town=n/sum(n))
    
    #replace wrong SIRUTA
    data_replace_siruta<-data_locality_frequency %>%
      group_by(judet_bac,town_hs_bac) %>%
      filter(n()>1 & town_hs_bac!='') %>%
      group_by(judet_bac,town_hs_bac) %>%
      arrange(judet_bac,town_hs_bac,-p_siruta) %>%
      slice(1) %>%
      filter(judet_bac!='BUCURESTI') %>% 
      select(-n,-p_siruta,-p_town) %>%
      ungroup()
    
    data_student<-data_student %>% 
      ungroup() %>%
      base::merge(data_replace_siruta,
                 by=c("judet_bac","town_hs_bac"),
                 all.x=T,suffixes=c("","_replace"))
    data_student<-data_student %>%
      mutate(Cod_SIRUTA_hs_bac=ifelse(is.na(Cod_SIRUTA_hs_bac_replace),Cod_SIRUTA_hs_bac,Cod_SIRUTA_hs_bac_replace)) %>%
      mutate(Cod_SIRUTA2_hs_bac=ifelse(is.na(Cod_SIRUTA2_hs_bac_replace),Cod_SIRUTA2_hs_bac,Cod_SIRUTA2_hs_bac_replace)) %>%
      select(-Cod_SIRUTA_hs_bac_replace,-Cod_SIRUTA2_hs_bac_replace)
    
    #replace wrong town
    data_locality_frequency<-data_student %>%
      group_by(judet_bac,Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac,town_hs_bac) %>%
      summarise(n=n()) %>%
      arrange(judet_bac,town_hs_bac,-n) %>%
      group_by(Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac) %>%
      mutate(p_siruta=n/sum(n)) %>%
      group_by(judet_bac,town_hs_bac) %>%
      mutate(p_town=n/sum(n)) %>%
      ungroup()
    
    data_replace_town<-data_locality_frequency %>%
      group_by(Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac) %>%
      filter(n()>1 & !is.na(Cod_SIRUTA_hs_bac)) %>%
      group_by(Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac) %>%
      arrange(Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac,-p_siruta) %>%
      slice(1) %>%
      filter(judet_bac!='BUCURESTI') %>% 
      select(-n,-p_siruta,-p_town) %>%
      ungroup()
    
    data_student<-data_student %>% 
      ungroup() %>%
      base::merge(data_replace_town,
                  by=c("Cod_SIRUTA_hs_bac","Cod_SIRUTA2_hs_bac"),
                  all.x=T,suffixes=c("","_replace"))
    data_student<-data_student %>%
      mutate(town_hs_bac=ifelse(is.na(town_hs_bac_replace),town_hs_bac,town_hs_bac_replace)) %>%
      select(-town_hs_bac_replace,-judet_bac_replace)
    
    data_bkp<-data_student
    
    #add codes for high schools with missing SIIIR codes
    data_missing_siruta<-data_student %>%
      group_by(judet_bac,Cod_SIIIR_hs_bac,unitate_de_invatamant,school_harmonized) %>%
      summarise() %>%
      filter(!is.na(unitate_de_invatamant) & is.na(Cod_SIIIR_hs_bac) & unitate_de_invatamant!='HS NOT MATCHED') 
    
    data_recover_siruta<-data_student %>%
      group_by(judet_bac,Cod_SIIIR_hs_bac,Cod_SIRUES_hs_bac,Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac,school_harmonized) %>%
      summarise(n=n()) %>%
      filter(!is.na(school_harmonized) & !is.na(Cod_SIIIR_hs_bac)) %>%
      semi_join(data_missing_siruta,by=c("judet_bac","school_harmonized")) %>%
      group_by(judet_bac,school_harmonized) %>%
      mutate(n=sum(n)) %>%
      arrange(judet_bac,school_harmonized,-n) %>%
      slice(1) %>%
      ungroup()
      
  
    data_student<-data_student %>% 
      ungroup() %>%
      base::merge(data_recover_siruta,
                  by=c("judet_bac","school_harmonized"),
                  all.x=T,suffixes=c("","_replace"))
    data_student<-data_student %>%
      mutate(Cod_SIIIR_hs_bac=ifelse(is.na(Cod_SIIIR_hs_bac_replace),Cod_SIIIR_hs_bac,Cod_SIIIR_hs_bac_replace)) %>%
      mutate(Cod_SIRUES_hs_bac=ifelse(is.na(Cod_SIRUES_hs_bac_replace),Cod_SIRUES_hs_bac,Cod_SIRUES_hs_bac_replace)) %>%
      mutate(Cod_SIRUTA_hs_bac=ifelse(is.na(Cod_SIRUTA_hs_bac_replace),Cod_SIRUTA_hs_bac,Cod_SIRUTA_hs_bac_replace)) %>%
      mutate(Cod_SIRUTA2_hs_bac=ifelse(is.na(Cod_SIRUTA2_hs_bac_replace),Cod_SIRUTA2_hs_bac,Cod_SIRUTA2_hs_bac_replace)) %>%
      select(-Cod_SIIIR_hs_bac_replace,-Cod_SIRUES_hs_bac_replace,-Cod_SIRUTA_hs_bac_replace,-Cod_SIRUTA2_hs_bac_replace)
      
    rm(data_missing_siruta,data_recover_siruta)
    
    data_bkp<-data_student
    
    #add new siiir code for those without any
    data_missing_siruta<-data_student %>%
      group_by(judet_bac,Cod_SIIIR_hs_bac,Cod_SIRUES_hs_bac,school_harmonized) %>%
      summarise() %>%
      filter(!is.na(school_harmonized) & is.na(Cod_SIIIR_hs_bac) & school_harmonized!='HS NOT MATCHED') %>%
      mutate(Cod_SIIIR_hs_bac=paste0("A-",school_harmonized)) %>%
      mutate(Cod_SIRUES_hs_bac=paste0("A-",school_harmonized)) %>%
      ungroup()
    
    data_student<-data_student %>% 
      ungroup() %>%
      base::merge(data_missing_siruta,
                  by=c("judet_bac","school_harmonized"),
                  all.x=T,suffixes=c("","_replace"))
    data_student<-data_student %>%
      mutate(Cod_SIIIR_hs_bac=ifelse(is.na(Cod_SIIIR_hs_bac_replace),Cod_SIIIR_hs_bac,Cod_SIIIR_hs_bac_replace)) %>%
      mutate(Cod_SIRUES_hs_bac=ifelse(is.na(Cod_SIRUES_hs_bac_replace),Cod_SIRUES_hs_bac,Cod_SIRUES_hs_bac_replace)) %>%
      select(-Cod_SIIIR_hs_bac_replace,-Cod_SIRUES_hs_bac_replace)
    
    data_bkp<-data_student
    
    rm(data_missing_siruta)
    
    #add SIIIR and SIRUTA codes for admission high schools
    data_admitted_school<-data_student %>%
      filter(!is.na(liceu_repartizat) & !is.na(unitate_de_invatamant) & school_harmonized!="HS NOT MATCHED") %>%
      group_by(judet_adm,judet_bac,liceu_repartizat,unitate_de_invatamant,an,Cod_SIRUTA_hs_bac,Cod_SIRUTA2_hs_bac,Cod_SIIIR_hs_bac,Cod_SIRUES_hs_bac,town_hs_bac) %>%
      summarise(n=n()) %>%
      arrange(judet_adm,liceu_repartizat,an,-n) %>%
      group_by(judet_adm,liceu_repartizat,an) %>%
      mutate(p=n/sum(n)) %>%
      slice(1) %>%
      filter((p>0.8 & n>5) | n>30)%>%
      select(-judet_bac,unitate_de_invatamant,-p,-n) %>%
      ungroup()
    

    data_student<-data_student %>%
      base::merge(data_admitted_school,
                  by=c("an","judet_adm","liceu_repartizat"),suffixes=c("","_replace"),all.x=T)
    data_student<-data_student %>%
      mutate(town_hs_adm=town_hs_bac_replace) %>%
      mutate(Cod_SIIIR_hs_adm=ifelse(is.na(Cod_SIIIR_hs_bac_replace),Cod_SIIIR_hs_adm,Cod_SIIIR_hs_bac_replace)) %>%
      mutate(Cod_SIRUES_hs_adm=ifelse(is.na(Cod_SIRUES_hs_bac_replace),Cod_SIRUES_hs_adm,Cod_SIRUES_hs_bac_replace)) %>%
      mutate(Cod_SIRUTA_hs_adm=ifelse(is.na(Cod_SIRUTA_hs_bac_replace),Cod_SIRUTA_hs_adm,Cod_SIRUTA_hs_bac_replace)) %>%
      mutate(Cod_SIRUTA2_hs_adm=ifelse(is.na(Cod_SIRUTA2_hs_bac_replace),Cod_SIRUTA2_hs_adm,Cod_SIRUTA2_hs_bac_replace)) %>%
      select(-matches("_replace")) 
    

      
      
      
    #Towns
    data_student$town<-with(data_student, interaction(town_hs_bac,  judet_bac))
    
    
    #Function to calculate percentiles
    perc.rank <- function(x) {
      y<-rank(x)/length(x)
      return(y)}
    
    #get perc scores including unmatched and failing students
    data_student<-data_student %>%
      mutate(media_0=case_when(
        is.na(media) ~ 0,
        TRUE ~ media
      ))
    

    
    
    #Get percentile entrance grade and graduation grade, excluding NA's
    data_student$entrance_perc<-NA
    data_student[!is.na(data_student$media_la_admitere),]<-data_student[!is.na(data_student$media_la_admitere),] %>%
      group_by(an) %>%
      mutate(entrance_perc=perc.rank(media_la_admitere)) %>%
      ungroup
    
    data_student$grad_perc<-NA
    data_student[!is.na(data_student$media),]<-data_student[!is.na(data_student$media),] %>%
      group_by(an) %>%
      mutate(grad_perc=perc.rank(media)) %>%
      ungroup
    data_student<-data_student %>%
      mutate(grad_perc_0=ifelse(is.na(grad_perc),0,grad_perc))
    
    data_student$entrance_perc_cls<-NA
    data_student[!is.na(data_student$media_la_admitere),]<-data_student[!is.na(data_student$media_la_admitere),] %>%
      group_by(an,school_harmonized,specializare_bac2) %>%
      mutate(entrance_perc_cls=perc.rank(media_la_admitere)) %>%
      ungroup
    
    data_student$grad_perc_cls<-NA
    data_student[!is.na(data_student$media),]<-data_student[!is.na(data_student$media),] %>%
      group_by(an,school_harmonized,specializare_bac2) %>%
      mutate(grad_perc_cls=perc.rank(media)) %>%
      ungroup
    
    data_student$entrance_perc_scl<-NA
    data_student[!is.na(data_student$media_la_admitere),]<-data_student[!is.na(data_student$media_la_admitere),] %>%
      group_by(an,school_harmonized) %>%
      mutate(entrance_perc_scl=perc.rank(media_la_admitere)) %>%
      ungroup
    
    data_student$grad_perc_scl<-NA
    data_student[!is.na(data_student$media),]<-data_student[!is.na(data_student$media),] %>%
      group_by(an,school_harmonized) %>%
      mutate(grad_perc_scl=perc.rank(media)) %>%
      ungroup
    
    data_student$entrance_perc_town<-NA
    data_student[!is.na(data_student$media_la_admitere),]<-data_student[!is.na(data_student$media_la_admitere),] %>%
      group_by(an,town_hs_bac) %>%
      mutate(entrance_perc_town=perc.rank(media_la_admitere)) %>%
      ungroup
    
    data_student$grad_perc_town<-NA
    data_student[!is.na(data_student$media),]<-data_student[!is.na(data_student$media),] %>%
      group_by(an,town_hs_bac) %>%
      mutate(grad_perc_town=perc.rank(media)) %>%
      ungroup
    
    data_student$grad_perc_town<-NA
    data_student[!is.na(data_student$media),]<-data_student[!is.na(data_student$media),] %>%
      group_by(an,town_hs_bac) %>%
      mutate(grad_perc_town=perc.rank(media)) %>%
      ungroup
    
    ###########
    data_student$entrance_perc_town_exam<-NA
    data_student[!is.na(data_student$media_en_tsu),]<-data_student[!is.na(data_student$media_en_tsu),] %>%
      group_by(an,town_hs_bac) %>%
      mutate(entrance_perc_town_exam=perc.rank(media_en_tsu)) %>%
      ungroup
    
    data_student$entrance_perc_exam<-NA
    data_student[!is.na(data_student$media_en_tsu),]<-data_student[!is.na(data_student$media_en_tsu),] %>%
      group_by(an) %>%
      mutate(entrance_perc_exam=perc.rank(media_en_tsu)) %>%
      ungroup
    
    data_student<-data_student %>% mutate(media_de_absolvire=ifelse(an==2008,(media_la_admitere*2-media_en_tsu),media_de_absolvire))
    
    data_student<-as.data.frame(data_student)
    data_student$entrance_perc_town_gpa<-NA
    data_student[!is.na(data_student$media_de_absolvire),]<-data_student[!is.na(data_student$media_de_absolvire),] %>%
      group_by(an,town_hs_bac) %>%
      mutate(entrance_perc_town_gpa=perc.rank(media_de_absolvire)) %>%
      ungroup
    
    data_student<-as.data.frame(data_student)
    data_student$entrance_perc_gpa<-NA
    data_student[!is.na(data_student$media_de_absolvire),]<-data_student[!is.na(data_student$media_de_absolvire),] %>%
      group_by(an) %>%
      mutate(entrance_perc_gpa=perc.rank(media_de_absolvire)) %>%
      ungroup
    
    data_student$entrance_perc_ro<-NA
    data_student[!is.na(data_student$nota_lb_romana),]<-data_student[!is.na(data_student$nota_lb_romana),] %>%
      group_by(an) %>%
      mutate(entrance_perc_ro=perc.rank(nota_lb_romana)) %>%
      ungroup
    
    data_student$entrance_perc_math<-NA
    data_student[!is.na(data_student$nota_matematica),]<-data_student[!is.na(data_student$nota_matematica),] %>%
      group_by(an) %>%
      mutate(entrance_perc_math=perc.rank(nota_matematica)) %>%
      ungroup
    
    data_student$grad_perc_ro<-NA
    data_student[!is.na(data_student$lb_romana_final),]<-data_student[!is.na(data_student$lb_romana_final),] %>% 
      group_by(an) %>% 
      mutate(grad_perc_ro=perc.rank(lb_romana_final))
    
    
    ###########
    
    #Get number of schools per town
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,an) %>% mutate(n_hs_town=n_distinct(school_harmonized))
    #Get mean entrance grade and variance
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,an,specializare_bac2) %>% 
      mutate(class_mean_yr=(sum(entrance_perc[!is.na(media_la_admitere)])-entrance_perc)/(sum(!is.na(media_la_admitere))-1))
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,an) %>% 
      mutate(school_mean_yr=(sum(entrance_perc[!is.na(media_la_admitere)])-entrance_perc)/(sum(!is.na(media_la_admitere))-1))
    
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,specializare_bac2) %>% 
      mutate(class_mean=(sum(entrance_perc[!is.na(media_la_admitere)])-entrance_perc)/(sum(!is.na(media_la_admitere))-1))
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized) %>% 
      mutate(school_mean=(sum(entrance_perc[!is.na(media_la_admitere)])-entrance_perc)/(sum(!is.na(media_la_admitere))-1))
    
    
    ###############
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,an,specializare_bac2) %>% 
      mutate(class_mean_yr_grad=(sum(grad_perc[!is.na(media)])-grad_perc)/(sum(!is.na(media))-1))
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,an) %>% 
      mutate(school_mean_yr_grad=(sum(grad_perc[!is.na(media)])-grad_perc)/(sum(!is.na(media))-1))
    
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized,specializare_bac2) %>% 
      mutate(class_mean_grad=(sum(grad_perc[!is.na(media)])-grad_perc)/(sum(!is.na(media))-1))
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,school_harmonized) %>% 
      mutate(school_mean_grad=(sum(grad_perc[!is.na(media)])-grad_perc)/(sum(!is.na(media))-1))
    
    
    
    # data_student<-data_student %>% mutate(q100_cls=ifelse(entrance_perc_cls>=0.75,1,0),
    #                       q75_cls=ifelse(entrance_perc_cls>=0.5 & entrance_perc_cls<0.75,1,0),
    #                       q50_cls=ifelse(entrance_perc_cls>=0.25 & entrance_perc_cls<0.5,1,0),
    #                       q25_cls=ifelse(entrance_perc_cls<0.25,1,0),
    #                       med100_cls=ifelse(entrance_perc_cls>=0.5,1,0),
    #                       med50_cls=ifelse(entrance_perc_cls<0.5,1,0))
    
    data_student<-data_student %>% mutate(quart_cls=cut(entrance_perc_cls, breaks = c(-Inf, 0.25,0.5,0.75, Inf), 
                                                        labels = c('1','2','3','4'), right = FALSE))
    data_student<-data_student %>% mutate(med_cls=cut(entrance_perc_cls, breaks = c(-Inf, 0.5, Inf), 
                                                      labels = c('Lower','Upper'), right = FALSE))
    data_student<-data_student %>% mutate(dec_cls=cut(entrance_perc_cls, breaks = c(-Inf, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, Inf), 
                                                      labels = c('1','2','3','4','5','6','7','8','9','10'), right = FALSE))
    data_student<-data_student %>% mutate(quart_scl=cut(entrance_perc_scl, breaks = c(-Inf, 0.25,0.5,0.75, Inf), 
                                                        labels = c('1','2','3','4'), right = FALSE))
    data_student<-data_student %>% mutate(med_scl=cut(entrance_perc_scl, breaks = c(-Inf, 0.5, Inf), 
                                                      labels = c('Lower','Upper'), right = FALSE))
    data_student<-data_student %>% mutate(dec_scl=cut(entrance_perc_scl, breaks = c(-Inf, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, Inf), 
                                                      labels = c('1','2','3','4','5','6','7','8','9','10'), right = FALSE))
    
    data_student<-data_student %>% mutate(quart=cut(entrance_perc, breaks = c(-Inf, 0.25,0.5,0.75, Inf), 
                                                    labels = c('1','2','3','4'), right = FALSE))
    data_student<-data_student %>% mutate(med=cut(entrance_perc, breaks = c(-Inf, 0.5, Inf), 
                                                  labels = c('Lower','Upper'), right = FALSE))
    data_student<-data_student %>% mutate(dec=cut(entrance_perc, breaks = c(-Inf, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, Inf), 
                                                  labels = c('1','2','3','4','5','6','7','8','9','10'), right = FALSE))
    data_student<-data_student %>% mutate(quart_town=cut(entrance_perc_town, breaks = c(-Inf, 0.25,0.5,0.75, Inf), 
                                                         labels = c('1','2','3','4'), right = FALSE))
    data_student<-data_student %>% mutate(med_town=cut(entrance_perc_town, breaks = c(-Inf, 0.5, Inf), 
                                                       labels = c('Lower','Upper'), right = FALSE))
    data_student<-data_student %>% mutate(dec_town=cut(entrance_perc_town, breaks = c(-Inf, 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, Inf), 
                                                       labels = c('1','2','3','4','5','6','7','8','9','10'), right = FALSE))
    
    
    
    
    #Get nuymber of schools per town
    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,an) %>% mutate(n_hs_town=n_distinct(school_harmonized))
    

    data_student<-data_student %>% group_by(judet_bac,town_hs_bac,an) %>% mutate(n_students_town_yr=n())
    

    data_student<-as.data.frame(data_student)
    data_student$n_hs_town_group<-data_student$n_hs_town
    data_student$n_hs_town_group[data_student$n_hs_town>=4 & data_student$n_hs_town<=15]<-"4-15"
    data_student$n_hs_town_group[data_student$n_hs_town>15]<-"16+"
    data_student$n_hs_town_group<-with(data_student, reorder(n_hs_town_group, n_hs_town))
    

  data_student<-data_student %>%
    mutate(id_adm=as.numeric(id_adm)+an*1000000,
           id_bac=as.numeric(id_bac)+an*1000000)

    
    ##add socioeconomic characteristics (and SIRUTA 2 codes for towns), obtained from Romanian INS as xlsx files and then aggregated using socioeconomic_stats.R
    county_stats<-readRDS('county_SES')
    town_stats<-readRDS('town_SES')
    
    #add county level stats
    data_student<-data_student %>%
      base::merge(county_stats,by.x=c("judet_bac"),by=("Judet"),suffixes=c("",""),all.x=T) %>%
      base::merge(county_stats,by.x=c("judet_adm"),by=("Judet"),suffixes=c("_hs_bac",""),all.x=T) %>%
      base::merge(county_stats,by.x=c("judet_ms"),by=("Judet"),suffixes=c("_hs_adm","_ms_adm"),all.x=T) 
    
    
    #add town level stats: unemployment
    # data_student<-data_siruta_bac
   
    data_student<-data_student %>%  
      base::merge(town_stats,by.x=c("Cod_SIRUTA2_hs_bac"),by.y=c("SIRUTA"),suffixes=c("",""),all.x=T)
    
    data_student<-data_student %>%  
     base::merge(town_stats,by.x=c("Cod_SIRUTA2_hs_adm"),by.y=c("SIRUTA"),suffixes=c("_hs_bac",""),all.x=T) 
      
    data_student<-data_student %>%  
      base::merge(town_stats,by.x=c("Cod_SIRUTA2_ms_adm"),by.y=c("SIRUTA"),suffixes=c("_hs_adm","_ms_adm"),all.x=T) 
      
  #add distances
  data_student<-data_student %>%
      rowwise %>%
      mutate(dist=distHaversine(c(lng.hs_adm,lat.hs_adm),c(lng.ms_adm,lat.ms_adm))) 
  
  
 
  setwd(wd_data_final)
  saveRDS(data_student,"data_student")
  setwd(wd)
  
  
  return(data_student)
}